package tfidf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ReadFiles {

	// Absolute paths of the regular files found by readDirs. The list is static and is
	// never cleared in this class, so every readDirs call appends to the same instance.
	private static List<String> fileList = new ArrayList<String>();
	// File path -> (word -> value), filled by tfOfAll. tfidf later overwrites the
	// per-word values of these same maps in place.
	private static HashMap<String, HashMap<String, Float>> allTheTf = new HashMap<String, HashMap<String, Float>>();
	// File path -> (word -> raw occurrence count), filled by NormalTFOfAll and read by idf.
	private static HashMap<String, HashMap<String, Integer>> allTheNormalTF = new HashMap<String, HashMap<String, Integer>>();

	/**
	 * Recursively collects the absolute paths of all regular files below a
	 * directory into the shared static file list.
	 *
	 * <p>The shared list is cumulative: results of earlier calls stay in it. A
	 * path that is already present is not added a second time, so scanning the
	 * same directory repeatedly does not create duplicates.
	 *
	 * @param filepath path of the directory to scan; if it is not a directory a
	 *                 message is printed to standard output and nothing is added
	 * @return the shared static list of absolute file paths (not a copy)
	 * @throws FileNotFoundException declared for backward compatibility
	 * @throws IOException if the contents of a directory cannot be listed
	 */
	public static List<String> readDirs(String filepath)
			throws FileNotFoundException, IOException {
		File dir = new File(filepath);
		if (!dir.isDirectory()) {
			System.out.println("输入的参数应该为[文件夹名]");
			System.out.println("filepath: " + dir.getAbsolutePath());
			return fileList;
		}

		// File.list() returns null (not an empty array) when the directory
		// cannot be read, e.g. because of an I/O error or missing permission.
		String[] names = dir.list();
		if (names == null) {
			throw new IOException("Unable to list directory: "
					+ dir.getAbsolutePath());
		}

		for (String name : names) {
			// Build the child with File(parent, child) so the platform's own
			// separator is used instead of a hard-coded backslash.
			File child = new File(dir, name);
			if (child.isDirectory()) {
				readDirs(child.getPath());
			} else {
				String path = child.getAbsolutePath();
				if (!fileList.contains(path)) {
					fileList.add(path);
				}
			}
		}
		return fileList;
	}

	/**
	 * Reads a whole text file decoded as GBK.
	 *
	 * @param file path of the file to read
	 * @return the file content, with every line (including the last one)
	 *         terminated by "\r\n"
	 * @throws FileNotFoundException if the file cannot be opened
	 * @throws IOException if reading fails
	 */
	public static String readFiles(String file) throws FileNotFoundException,
			IOException {
		StringBuilder content = new StringBuilder();
		FileInputStream in = new FileInputStream(file);
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					in, "gbk"));
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line).append("\r\n");
			}
		} finally {
			// Closing the underlying stream releases the file handle even when
			// decoding or reading fails part-way through.
			in.close();
		}
		return content.toString();
	}

	/**
	 * Reads a whole text file decoded as UTF-8 and flattens it to one line.
	 *
	 * @param filePath path of the file to read
	 * @return the file content, with every line (including the last one)
	 *         followed by a single space instead of its line terminator
	 * @throws FileNotFoundException if the file cannot be opened
	 * @throws IOException if reading fails
	 */
	public static String getText(String filePath) throws FileNotFoundException,IOException 
	{
		StringBuilder text = new StringBuilder();
		FileInputStream in = new FileInputStream(filePath);
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
			String aline;
			while ((aline = reader.readLine()) != null) {
				text.append(aline).append(' ');
			}
		} finally {
			// Closing the underlying stream releases the file handle even when
			// reading fails part-way through.
			in.close();
		}
		return text.toString();
	}	
	
	/**
	 * Tokenizes the UTF-8 text of a file with Lucene's SmartChineseAnalyzer.
	 *
	 * @param file path of the file to tokenize (read through getText)
	 * @return the tokens in the order the analyzer produced them; an empty
	 *         array when the analyzer yields no token
	 * @throws IOException if the file cannot be read or tokenizing fails
	 */
	public static String[] cutWord(String file) throws IOException {
		List<String> tokens = new ArrayList<String>();
		Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
		try {
			TokenStream ts = analyzer.tokenStream("", new StringReader(getText(file)));
			try {
				// The attribute instance is reused for every token, so it only
				// needs to be looked up once, before consuming the stream.
				CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
				// Consumer workflow from the TokenStream documentation:
				// reset(), incrementToken() until false, end(), close().
				ts.reset();
				while (ts.incrementToken()) {
					tokens.add(term.toString());
				}
				ts.end();
			} finally {
				ts.close();
			}
		} finally {
			analyzer.close();
		}
		return tokens.toArray(new String[tokens.size()]);
	}

	/**
	 * Computes the normalized term frequency of every distinct word.
	 *
	 * <p>The input array is not modified. Elements that are {@code null} or
	 * equal to a single space (the placeholder value this class historically
	 * used for consumed slots) are ignored as words, but still count toward the
	 * total used as the denominator.
	 *
	 * @param cutWordResult the tokens of one document
	 * @return word -> (occurrences of the word / length of the array)
	 */
	public static HashMap<String, Float> tf(String[] cutWordResult) {
		HashMap<String, Float> tf = new HashMap<String, Float>();// normalized
		int wordNum = cutWordResult.length;

		// First pass: raw occurrence count per word.
		HashMap<String, Integer> counts = new HashMap<String, Integer>();
		for (String word : cutWordResult) {
			if (word == null || " ".equals(word)) {
				continue;
			}
			Integer seen = counts.get(word);
			counts.put(word, seen == null ? 1 : seen + 1);
		}

		// Second pass: normalize, inserting words in first-occurrence order.
		for (String word : cutWordResult) {
			if (word == null || " ".equals(word) || tf.containsKey(word)) {
				continue;
			}
			tf.put(word, (float) counts.get(word) / wordNum);
		}
		return tf;
	}

	/**
	 * Counts how often every distinct word occurs (term frequency without
	 * normalization).
	 *
	 * <p>The input array is not modified. Elements that are {@code null} or
	 * equal to a single space (the placeholder value this class historically
	 * used for consumed slots) are ignored.
	 *
	 * @param cutWordResult the tokens of one document
	 * @return word -> number of occurrences in the array
	 */
	public static HashMap<String, Integer> normalTF(String[] cutWordResult) {
		HashMap<String, Integer> tfNormal = new HashMap<String, Integer>();// not normalized
		for (String word : cutWordResult) {
			if (word == null || " ".equals(word)) {
				continue;
			}
			Integer seen = tfNormal.get(word);
			tfNormal.put(word, seen == null ? 1 : seen + 1);
		}
		return tfNormal;
	}

	/**
	 * Computes the normalized term frequencies of every file returned by
	 * readDirs for the given directory and stores them in the shared static
	 * map, keyed by file path.
	 *
	 * @param dir directory to scan
	 * @return the shared static map: file path -> (word -> normalized frequency)
	 * @throws IOException if scanning or reading a file fails
	 */
	public static Map<String, HashMap<String, Float>> tfOfAll(String dir)
			throws IOException {
		List<String> paths = ReadFiles.readDirs(dir);
		for (int idx = 0; idx < paths.size(); idx++) {
			String path = paths.get(idx);
			String[] words = ReadFiles.cutWord(path);
			allTheTf.put(path, ReadFiles.tf(words));
		}
		return allTheTf;
	}

	/**
	 * Computes the raw word counts of every file returned by readDirs for the
	 * given directory and stores them in the shared static map, keyed by file
	 * path.
	 *
	 * @param dir directory to scan
	 * @return the shared static map: file path -> (word -> occurrence count)
	 * @throws IOException if scanning or reading a file fails
	 */
	public static Map<String, HashMap<String, Integer>> NormalTFOfAll(String dir)
			throws IOException {
		List<String> paths = ReadFiles.readDirs(dir);
		for (String path : paths) {
			String[] words = ReadFiles.cutWord(path);
			HashMap<String, Integer> counts = ReadFiles.normalTF(words);
			allTheNormalTF.put(path, counts);
		}
		return allTheNormalTF;
	}

	/**
	 * Computes the inverse document frequency of every word found in the raw
	 * word counts previously stored by NormalTFOfAll.
	 *
	 * <p>Formula: IDF = log((1 + |D|) / |Dt|), where |D| is the number of
	 * stored documents and |Dt| the number of documents containing the term t.
	 * The logarithm is delegated to {@code Log.log(value, 10)}, a project
	 * helper not shown in this file (presumably base 10 - confirm against its
	 * definition).
	 *
	 * @param dir not used by this method; kept for interface compatibility
	 * @return word -> IDF value; empty when no raw counts have been stored yet
	 * @throws FileNotFoundException declared for backward compatibility
	 * @throws UnsupportedEncodingException declared for backward compatibility
	 * @throws IOException declared for backward compatibility
	 */
	public static Map<String, Float> idf(String dir)
			throws FileNotFoundException, UnsupportedEncodingException,
			IOException {
		Map<String, Float> idf = new HashMap<String, Float>();
		float D = allTheNormalTF.size();// total number of documents

		// One pass over all documents: in how many documents does each word occur?
		Map<String, Integer> docFreq = new HashMap<String, Integer>();
		for (HashMap<String, Integer> doc : allTheNormalTF.values()) {
			for (String word : doc.keySet()) {
				Integer seen = docFreq.get(word);
				docFreq.put(word, seen == null ? 1 : seen + 1);
			}
		}

		for (Map.Entry<String, Integer> entry : docFreq.entrySet()) {
			float Dt = entry.getValue();// documents containing this word
			idf.put(entry.getKey(), Log.log((1 + D) / Dt, 10));
		}
		return idf;
	}

	/**
	 * Computes TF-IDF for every file below a directory by multiplying each
	 * normalized term frequency from tfOfAll with the word's IDF.
	 *
	 * <p>The multiplication is done in place on the maps returned by tfOfAll,
	 * so those maps hold TF-IDF values afterwards. If the IDF table built from
	 * the previously stored raw counts does not cover every word of the corpus
	 * (for example because NormalTFOfAll was never called), the raw counts are
	 * rebuilt for {@code dir} and the IDF table is recomputed before scaling.
	 *
	 * @param dir directory to scan
	 * @return file path -> (word -> TF-IDF value)
	 * @throws IOException if scanning or reading a file fails
	 */
	public static Map<String, HashMap<String, Float>> tfidf(String dir)
			throws IOException {

		Map<String, Float> idf = ReadFiles.idf(dir);
		Map<String, HashMap<String, Float>> tf = ReadFiles.tfOfAll(dir);

		// A missing IDF entry would otherwise surface as a NullPointerException
		// when the Float is unboxed for the multiplication below.
		boolean idfComplete = true;
		for (HashMap<String, Float> perFile : tf.values()) {
			if (!idf.keySet().containsAll(perFile.keySet())) {
				idfComplete = false;
				break;
			}
		}
		if (!idfComplete) {
			ReadFiles.NormalTFOfAll(dir);
			idf = ReadFiles.idf(dir);
		}

		for (HashMap<String, Float> singleFile : tf.values()) {
			for (Map.Entry<String, Float> entry : singleFile.entrySet()) {
				// setValue replaces the value of an existing key, so the map is
				// not structurally modified while it is being iterated.
				entry.setValue(idf.get(entry.getKey()) * entry.getValue());
			}
		}
		return tf;
	}
}
